Анализ временных рядов - данный метод позволяет проанализировать изменения продаж во времени и выявить цикличность или сезонность в данных.¶
Импорт библиотек
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
Чтение данных из эксель файла
In [2]:
df = pd.read_excel('Задания_1_2.xlsx')
df.Date = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)
df
Out[2]:
| series1 | |
|---|---|
| Date | |
| 2015-01-01 | 1006.699649 |
| 2015-01-02 | 3197.751826 |
| 2015-01-03 | 3217.491035 |
| 2015-01-04 | 2151.573759 |
| 2015-01-05 | 4243.929892 |
| ... | ... |
| 2019-06-26 | 4007.059387 |
| 2019-06-27 | 4836.106157 |
| 2019-06-28 | 4895.323783 |
| 2019-06-29 | 4086.016222 |
| 2019-06-30 | 3572.796793 |
1642 rows × 1 columns
Построение графика
In [3]:
fig = px.line(df, x = df.index, y = 'series1')
fig.show()
In [4]:
from statsmodels.tsa.stattools import adfuller, kpss
result = adfuller(df['series1'], regression='n', autolag='BIC')
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
ADF Statistic: -1.203086 p-value: 0.209619 Critical Values: 1%: -2.567 5%: -1.941 10%: -1.617
In [5]:
result_kpss = kpss(df['series1'], regression='ct')
print('ADF Statistic: %f' % result_kpss[0])
print('p-value: %f' % result_kpss[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
ADF Statistic: 0.181991 p-value: 0.022753 Critical Values: 1%: -2.567 5%: -1.941 10%: -1.617
In [6]:
from scipy import stats
from scipy.special import inv_boxcox
# Преобразование
df['boxcox'], lambda_value = stats.boxcox(df['series1'])
df['boxcox_shifted_S'] = df.boxcox - df.boxcox.shift(12)
df['boxcox_shifted'] = df.boxcox_shifted_S- df.boxcox_shifted_S.shift(1)
In [7]:
non_nan_values = df['boxcox_shifted'].dropna()
non_nan_values = pd.DataFrame({'non_nan_values': non_nan_values})
non_nan_values
Out[7]:
| non_nan_values | |
|---|---|
| Date | |
| 2015-01-14 | -2.144242 |
| 2015-01-15 | 0.048067 |
| 2015-01-16 | 0.641200 |
| 2015-01-17 | -1.425537 |
| 2015-01-18 | 0.012073 |
| ... | ... |
| 2019-06-26 | -0.271553 |
| 2019-06-27 | 0.890321 |
| 2019-06-28 | 0.036524 |
| 2019-06-29 | -0.878101 |
| 2019-06-30 | 0.124530 |
1629 rows × 1 columns
In [8]:
from statsmodels.tsa.stattools import adfuller, kpss
result_kpss = kpss(non_nan_values['non_nan_values'], regression='ct')
print('ADF Statistic: %f' % result_kpss[0])
print('p-value: %f' % result_kpss[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
ADF Statistic: 0.051231 p-value: 0.100000 Critical Values: 1%: -2.567 5%: -1.941 10%: -1.617
C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned.
In [9]:
from statsmodels.tsa.stattools import adfuller, kpss
result = adfuller(non_nan_values['non_nan_values'], regression='c', autolag='AIC')
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
print(result_kpss)
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
ADF Statistic: -13.725321
p-value: 0.000000
Critical Values:
(0.05123085365144317, 0.1, 69, {'10%': 0.119, '5%': 0.146, '2.5%': 0.176, '1%': 0.216})
1%: -3.434
5%: -2.863
10%: -2.568
In [10]:
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
# Plot acf and pacf
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5), dpi=80)
plot_acf(non_nan_values)
plot_pacf(non_nan_values, lags=40, method='ywm')
ax1.tick_params(axis='both', labelsize=12)
ax2.tick_params(axis='both', labelsize=12)
plt.show()
In [11]:
train = non_nan_values.iloc[:-int(len(non_nan_values) * 0.2)]
test = non_nan_values.iloc[-int(len(non_nan_values) * 0.2):]
In [12]:
import pmdarima as pm
mymodel = pm.auto_arima(
train,
start_p = 1, start_q = 1,
max_p = 5, max_q = 5,
seasonal = True, m = 12,
trace=True,
suppress_warnings=True,
max_P = 2, max_Q = 2,
max_D = 2, max_d=2,
alpha=0.05,
test = 'kpss',
seasonal_test='ocsb',
n_fits=100,
stepwise=False,
information_criterion='bic',
out_of_sample_size=7
)
print(mymodel.summary())
ARIMA(0,0,0)(0,0,0)[12] intercept : BIC=2993.211, Time=0.19 sec
ARIMA(0,0,0)(0,0,1)[12] intercept : BIC=inf, Time=1.55 sec
ARIMA(0,0,0)(0,0,2)[12] intercept : BIC=inf, Time=4.32 sec
ARIMA(0,0,0)(1,0,0)[12] intercept : BIC=2614.715, Time=0.81 sec
ARIMA(0,0,0)(1,0,1)[12] intercept : BIC=inf, Time=1.78 sec
ARIMA(0,0,0)(1,0,2)[12] intercept : BIC=inf, Time=2.86 sec
ARIMA(0,0,0)(2,0,0)[12] intercept : BIC=2504.470, Time=1.39 sec
ARIMA(0,0,0)(2,0,1)[12] intercept : BIC=inf, Time=4.61 sec
ARIMA(0,0,0)(2,0,2)[12] intercept : BIC=inf, Time=6.13 sec
ARIMA(0,0,1)(0,0,0)[12] intercept : BIC=2647.221, Time=0.37 sec
ARIMA(0,0,1)(0,0,1)[12] intercept : BIC=inf, Time=2.80 sec
ARIMA(0,0,1)(0,0,2)[12] intercept : BIC=inf, Time=4.92 sec
ARIMA(0,0,1)(1,0,0)[12] intercept : BIC=2337.737, Time=0.70 sec
ARIMA(0,0,1)(1,0,1)[12] intercept : BIC=inf, Time=2.82 sec
ARIMA(0,0,1)(1,0,2)[12] intercept : BIC=inf, Time=6.24 sec
ARIMA(0,0,1)(2,0,0)[12] intercept : BIC=2108.279, Time=1.91 sec
ARIMA(0,0,1)(2,0,1)[12] intercept : BIC=inf, Time=5.57 sec
ARIMA(0,0,1)(2,0,2)[12] intercept : BIC=inf, Time=7.42 sec
ARIMA(0,0,2)(0,0,0)[12] intercept : BIC=inf, Time=1.15 sec
ARIMA(0,0,2)(0,0,1)[12] intercept : BIC=inf, Time=2.55 sec
ARIMA(0,0,2)(0,0,2)[12] intercept : BIC=inf, Time=5.13 sec
ARIMA(0,0,2)(1,0,0)[12] intercept : BIC=2249.351, Time=1.00 sec
ARIMA(0,0,2)(1,0,1)[12] intercept : BIC=inf, Time=3.11 sec
ARIMA(0,0,2)(1,0,2)[12] intercept : BIC=inf, Time=7.05 sec
ARIMA(0,0,2)(2,0,0)[12] intercept : BIC=2079.198, Time=2.74 sec
ARIMA(0,0,2)(2,0,1)[12] intercept : BIC=inf, Time=5.45 sec
ARIMA(0,0,3)(0,0,0)[12] intercept : BIC=inf, Time=1.86 sec
ARIMA(0,0,3)(0,0,1)[12] intercept : BIC=inf, Time=3.14 sec
ARIMA(0,0,3)(0,0,2)[12] intercept : BIC=inf, Time=7.21 sec
ARIMA(0,0,3)(1,0,0)[12] intercept : BIC=2246.229, Time=1.26 sec
ARIMA(0,0,3)(1,0,1)[12] intercept : BIC=inf, Time=5.06 sec
ARIMA(0,0,3)(2,0,0)[12] intercept : BIC=2085.636, Time=3.85 sec
ARIMA(0,0,4)(0,0,0)[12] intercept : BIC=inf, Time=3.06 sec
ARIMA(0,0,4)(0,0,1)[12] intercept : BIC=inf, Time=4.29 sec
ARIMA(0,0,4)(1,0,0)[12] intercept : BIC=2251.691, Time=1.62 sec
ARIMA(0,0,5)(0,0,0)[12] intercept : BIC=inf, Time=2.21 sec
ARIMA(1,0,0)(0,0,0)[12] intercept : BIC=2895.313, Time=0.28 sec
ARIMA(1,0,0)(0,0,1)[12] intercept : BIC=inf, Time=1.60 sec
ARIMA(1,0,0)(0,0,2)[12] intercept : BIC=inf, Time=4.79 sec
ARIMA(1,0,0)(1,0,0)[12] intercept : BIC=2507.114, Time=0.67 sec
ARIMA(1,0,0)(1,0,1)[12] intercept : BIC=inf, Time=3.17 sec
ARIMA(1,0,0)(1,0,2)[12] intercept : BIC=inf, Time=3.99 sec
ARIMA(1,0,0)(2,0,0)[12] intercept : BIC=2331.320, Time=2.14 sec
ARIMA(1,0,0)(2,0,1)[12] intercept : BIC=inf, Time=5.05 sec
ARIMA(1,0,0)(2,0,2)[12] intercept : BIC=inf, Time=5.68 sec
ARIMA(1,0,1)(0,0,0)[12] intercept : BIC=inf, Time=1.38 sec
ARIMA(1,0,1)(0,0,1)[12] intercept : BIC=inf, Time=2.98 sec
ARIMA(1,0,1)(0,0,2)[12] intercept : BIC=inf, Time=6.11 sec
ARIMA(1,0,1)(1,0,0)[12] intercept : BIC=2244.383, Time=1.37 sec
ARIMA(1,0,1)(1,0,1)[12] intercept : BIC=inf, Time=3.12 sec
ARIMA(1,0,1)(1,0,2)[12] intercept : BIC=inf, Time=7.71 sec
ARIMA(1,0,1)(2,0,0)[12] intercept : BIC=2079.557, Time=3.49 sec
ARIMA(1,0,1)(2,0,1)[12] intercept : BIC=inf, Time=5.77 sec
ARIMA(1,0,2)(0,0,0)[12] intercept : BIC=inf, Time=1.37 sec
ARIMA(1,0,2)(0,0,1)[12] intercept : BIC=inf, Time=3.28 sec
ARIMA(1,0,2)(0,0,2)[12] intercept : BIC=inf, Time=6.33 sec
ARIMA(1,0,2)(1,0,0)[12] intercept : BIC=2249.681, Time=2.36 sec
ARIMA(1,0,2)(1,0,1)[12] intercept : BIC=inf, Time=3.49 sec
ARIMA(1,0,2)(2,0,0)[12] intercept : BIC=2085.903, Time=3.66 sec
ARIMA(1,0,3)(0,0,0)[12] intercept : BIC=inf, Time=1.25 sec
ARIMA(1,0,3)(0,0,1)[12] intercept : BIC=inf, Time=2.96 sec
ARIMA(1,0,3)(1,0,0)[12] intercept : BIC=2252.894, Time=2.39 sec
ARIMA(1,0,4)(0,0,0)[12] intercept : BIC=inf, Time=2.33 sec
ARIMA(2,0,0)(0,0,0)[12] intercept : BIC=2745.267, Time=0.31 sec
ARIMA(2,0,0)(0,0,1)[12] intercept : BIC=inf, Time=3.19 sec
ARIMA(2,0,0)(0,0,2)[12] intercept : BIC=inf, Time=16.05 sec
ARIMA(2,0,0)(1,0,0)[12] intercept : BIC=2460.390, Time=1.12 sec
ARIMA(2,0,0)(1,0,1)[12] intercept : BIC=inf, Time=3.74 sec
ARIMA(2,0,0)(1,0,2)[12] intercept : BIC=inf, Time=7.49 sec
ARIMA(2,0,0)(2,0,0)[12] intercept : BIC=2257.867, Time=6.58 sec
ARIMA(2,0,0)(2,0,1)[12] intercept : BIC=inf, Time=8.97 sec
ARIMA(2,0,1)(0,0,0)[12] intercept : BIC=inf, Time=1.51 sec
ARIMA(2,0,1)(0,0,1)[12] intercept : BIC=inf, Time=4.61 sec
ARIMA(2,0,1)(0,0,2)[12] intercept : BIC=inf, Time=7.17 sec
ARIMA(2,0,1)(1,0,0)[12] intercept : BIC=2248.408, Time=1.74 sec
ARIMA(2,0,1)(1,0,1)[12] intercept : BIC=inf, Time=4.21 sec
ARIMA(2,0,1)(2,0,0)[12] intercept : BIC=2085.424, Time=5.29 sec
ARIMA(2,0,2)(0,0,0)[12] intercept : BIC=inf, Time=2.09 sec
ARIMA(2,0,2)(0,0,1)[12] intercept : BIC=inf, Time=4.06 sec
ARIMA(2,0,2)(1,0,0)[12] intercept : BIC=inf, Time=4.56 sec
ARIMA(2,0,3)(0,0,0)[12] intercept : BIC=inf, Time=2.60 sec
ARIMA(3,0,0)(0,0,0)[12] intercept : BIC=2700.759, Time=0.63 sec
ARIMA(3,0,0)(0,0,1)[12] intercept : BIC=inf, Time=3.23 sec
ARIMA(3,0,0)(0,0,2)[12] intercept : BIC=inf, Time=6.12 sec
ARIMA(3,0,0)(1,0,0)[12] intercept : BIC=2409.244, Time=1.26 sec
ARIMA(3,0,0)(1,0,1)[12] intercept : BIC=inf, Time=4.02 sec
ARIMA(3,0,0)(2,0,0)[12] intercept : BIC=2212.589, Time=2.76 sec
ARIMA(3,0,1)(0,0,0)[12] intercept : BIC=inf, Time=2.04 sec
ARIMA(3,0,1)(0,0,1)[12] intercept : BIC=inf, Time=3.60 sec
ARIMA(3,0,1)(1,0,0)[12] intercept : BIC=2237.885, Time=2.00 sec
ARIMA(3,0,2)(0,0,0)[12] intercept : BIC=inf, Time=2.44 sec
ARIMA(4,0,0)(0,0,0)[12] intercept : BIC=2692.358, Time=0.52 sec
ARIMA(4,0,0)(0,0,1)[12] intercept : BIC=inf, Time=3.09 sec
ARIMA(4,0,0)(1,0,0)[12] intercept : BIC=2347.557, Time=1.22 sec
ARIMA(4,0,1)(0,0,0)[12] intercept : BIC=inf, Time=2.40 sec
ARIMA(5,0,0)(0,0,0)[12] intercept : BIC=2563.931, Time=0.75 sec
Best model: ARIMA(0,0,2)(2,0,0)[12] intercept
Total fit time: 329.427 seconds
SARIMAX Results
===========================================================================================
Dep. Variable: y No. Observations: 1304
Model: SARIMAX(0, 0, 2)x(2, 0, [], 12) Log Likelihood -1018.079
Date: Tue, 11 Jun 2024 AIC 2048.159
Time: 18:13:18 BIC 2079.198
Sample: 0 HQIC 2059.803
- 1304
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
intercept -0.0002 0.003 -0.050 0.960 -0.007 0.007
ma.L1 -0.6316 0.018 -34.407 0.000 -0.668 -0.596
ma.L2 -0.1671 0.020 -8.241 0.000 -0.207 -0.127
ar.S.L12 -0.6461 0.020 -31.989 0.000 -0.686 -0.606
ar.S.L24 -0.3713 0.018 -20.690 0.000 -0.406 -0.336
sigma2 0.2770 0.005 52.297 0.000 0.267 0.287
===================================================================================
Ljung-Box (L1) (Q): 0.06 Jarque-Bera (JB): 5419.41
Prob(Q): 0.81 Prob(JB): 0.00
Heteroskedasticity (H): 0.62 Skew: -1.23
Prob(H) (two-sided): 0.00 Kurtosis: 12.68
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
In [13]:
from statsmodels.tsa.arima.model import ARIMA
model = ARIMA(train, order=(0,0,2),
seasonal_order=(2,0,0,12)).fit()
forecasts = model.forecast(len(test))
forecasts
C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency D will be used. C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency D will be used. C:\Users\user\AppData\Roaming\Python\Python39\site-packages\statsmodels\tsa\base\tsa_model.py:471: ValueWarning: No frequency information was provided, so inferred frequency D will be used.
Out[13]:
2018-08-10 0.206257
2018-08-11 -0.853199
2018-08-12 -0.121371
2018-08-13 -0.227366
2018-08-14 -0.013188
...
2019-06-26 -0.000073
2019-06-27 -0.000072
2019-06-28 -0.000073
2019-06-29 -0.000071
2019-06-30 -0.000072
Freq: D, Name: predicted_mean, Length: 325, dtype: float64
In [14]:
data_forecasts = pd.DataFrame({'forecasts': forecasts})
data_forecasts
Out[14]:
| forecasts | |
|---|---|
| 2018-08-10 | 0.206257 |
| 2018-08-11 | -0.853199 |
| 2018-08-12 | -0.121371 |
| 2018-08-13 | -0.227366 |
| 2018-08-14 | -0.013188 |
| ... | ... |
| 2019-06-26 | -0.000073 |
| 2019-06-27 | -0.000072 |
| 2019-06-28 | -0.000073 |
| 2019-06-29 | -0.000071 |
| 2019-06-30 | -0.000072 |
325 rows × 1 columns
In [15]:
from scipy.special import inv_boxcox
full_df = pd.merge(df,data_forecasts, left_on = df.index, right_on = data_forecasts.index , how = 'left' )
full_df
Out[15]:
| key_0 | series1 | boxcox | boxcox_shifted_S | boxcox_shifted | forecasts | |
|---|---|---|---|---|---|---|
| 0 | 2015-01-01 | 1006.699649 | 9.427615 | NaN | NaN | NaN |
| 1 | 2015-01-02 | 3197.751826 | 11.621358 | NaN | NaN | NaN |
| 2 | 2015-01-03 | 3217.491035 | 11.633629 | NaN | NaN | NaN |
| 3 | 2015-01-04 | 2151.573759 | 10.844711 | NaN | NaN | NaN |
| 4 | 2015-01-05 | 4243.929892 | 12.192449 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... |
| 1637 | 2019-06-26 | 4007.059387 | 12.075449 | -0.228174 | -0.271553 | -0.000073 |
| 1638 | 2019-06-27 | 4836.106157 | 12.460696 | 0.662147 | 0.890321 | -0.000072 |
| 1639 | 2019-06-28 | 4895.323783 | 12.485843 | 0.698671 | 0.036524 | -0.000073 |
| 1640 | 2019-06-29 | 4086.016222 | 12.115136 | -0.179430 | -0.878101 | -0.000071 |
| 1641 | 2019-06-30 | 3572.796793 | 11.843477 | -0.054900 | 0.124530 | -0.000072 |
1642 rows × 6 columns
In [16]:
full_df['boxcox_shifted_S'] = full_df.forecasts + full_df.boxcox_shifted_S.shift(1)
full_df['boxcox'] = full_df.boxcox_shifted_S + full_df.boxcox.shift(12)
full_df['beer'] = inv_boxcox(full_df.boxcox, lambda_value)
In [17]:
predict = full_df['beer'].dropna()
predict
Out[17]:
1317 3350.744376
1318 4780.633711
1319 3739.578494
1320 2928.890317
1321 4740.969386
...
1637 4576.297157
1638 3116.802369
1639 4809.361048
1640 6241.050493
1641 3357.733594
Name: beer, Length: 325, dtype: float64
In [18]:
train_df = df.iloc[:-int(len(df) * 0.2)]
test_df = df.iloc[-int(len(df) * 0.2):]
In [19]:
# Import packages
import plotly.graph_objects as go
def plot_forecasts(forecasts: list[float], title: str) -> None:
"""Function to plot the forecasts."""
fig = go.Figure()
fig.add_trace(go.Scatter(x=train_df.index, y=train_df['series1'], name='Train'))
fig.add_trace(go.Scatter(x=test_df.index, y=test_df['series1'], name='Test'))
fig.add_trace(go.Scatter(x=test_df.index, y=predict, name='Forecast'))
fig.update_layout(template="simple_white", font=dict(size=18), title_text=title,
width=650, title_x=0.5, height=400, xaxis_title='Date')
return fig.show()
# Plot the forecasts
plot_forecasts(forecasts, 'ARIMA')
In [20]:
res = test_df.iloc[:len(predict)]['series1']
res
Out[20]:
Date
2018-08-07 4322.886728
2018-08-08 4540.018025
2018-08-09 4224.190684
2018-08-10 3868.884925
2018-08-11 3691.232046
...
2019-06-23 3375.404705
2019-06-24 4401.843563
2019-06-25 3710.971255
2019-06-26 4007.059387
2019-06-27 4836.106157
Name: series1, Length: 325, dtype: float64
In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
# Вычисление средней абсолютной ошибки (MAE)
mae = mean_absolute_error(res, predict)
print("MAE:", mae)
# Вычисление среднеквадратичной ошибки (MSE)
mse = mean_squared_error(res, predict)
print("MSE:", mse)
MAE: 869.6450418707591 MSE: 1665688.4396404284
In [ ]: